import requests

from spider_utils.csv_utils import img_data_to_csv
from spider_utils.img_utils import get_img_mes
from bs4 import BeautifulSoup

from spider_utils.tag_utils import get_tags

# Browser-like User-Agent header sent with every HTTP request below.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/89.0.4389.114 Safari/537.36 '
)
headers = {'user-agent': _USER_AGENT}

# One shared session so every request reuses the same connection state.
session = requests.Session()

# Original author's note: "81 [1-21] is ok" -- presumably the tag has 81
# listing pages and pages 1-21 were already scraped; TODO confirm.
# Inclusive range of listing pages to crawl in this run.
min_page, max_page = 21, 41
baseURL = 'http://acg17.com/tag/pixiv-painter/'

# Source site recorded with every image row.
src_site = 'http://acg17.com'

# Accumulates one dict per image (its URL and related data).
img_data_list = []

# Progress counter printed by the crawl loop; starts at 1.
i = 1

# Seconds to wait on a single HTTP request before giving up; without a
# timeout a stalled connection would hang the whole crawl indefinitely.
REQUEST_TIMEOUT = 30


def _fetch_soup(url):
    """Fetch *url* with the shared session and return the parsed document.

    Returns None (after printing the error) when the request fails, so that
    one unreachable page does not abort the crawl and discard the rows that
    were already collected.
    """
    try:
        response = session.request('GET', url, headers=headers,
                                   timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('err', url, exc)
        return None
    return BeautifulSoup(response.text, 'html.parser')


# Walk every listing page in [min_page, max_page], open each linked post and
# record one row per image found inside the post's `.entry` element.
for i, page in enumerate(range(min_page, max_page + 1), start=1):
    # 1-based progress indicator.
    print(i)

    # baseURL may already end with '/', so strip it to avoid a '//' in the path.
    listing = _fetch_soup(baseURL.rstrip('/') + '/page/' + str(page) + '/')
    if listing is None:
        continue

    # Collect the post links: the first anchor inside each post title.
    list_urls = []
    for h_tag in listing.select('.post-box-title'):
        a_tag = h_tag.find('a', href=True)
        if a_tag is not None:
            list_urls.append(a_tag['href'])

    for list_url in list_urls:
        post = _fetch_soup(list_url)
        if post is None:
            continue

        # Tags come from the keywords meta element; a post without one is
        # still recorded, just with no tags.
        keyword_metas = post.select('meta[name="keywords"]')
        keywords = keyword_metas[0].get('content') if keyword_metas else None
        if keywords is None:
            tags = []
        else:
            # get_tags is handed the keywords with ',' replaced by '.';
            # dict.fromkeys drops duplicates while keeping a stable order so
            # repeated runs produce the same CSV.
            tags = list(dict.fromkeys(get_tags(keywords.replace(',', '.'))))

        # Visit every image in the post body.
        for img_tag in post.select('.entry img'):
            try:
                src = img_tag['src']
                # Extra per-image data; the result is read by its 'colors'
                # and 'size' keys.
                other_mes = get_img_mes(src)
                img_data = {
                    'src_site': src_site,
                    'tags': ','.join(tags),
                    'src': src,
                    'outer_net': 0,
                    'colors': other_mes['colors'],
                    'size': other_mes['size'],
                }
            except Exception as exc:
                # Best effort: skip images that cannot be processed, but let
                # KeyboardInterrupt/SystemExit through and show the cause.
                print('err', exc)
                continue
            img_data_list.append(img_data)

if img_data_list:
    # Column names are taken from the first row; every row has the same keys.
    img_data_to_csv(
        img_data_list,
        img_data_list[0].keys(),
        'data/acg17_painters_data_page{}-{}.csv'.format(min_page, max_page),
    )
else:
    # Nothing was scraped, so there is no first row to take the columns from.
    print('no image data collected; CSV not written')
